library(beeswarm)
library(naniar)
library(zoo)

Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric
# install.packages("zoo")
library(janitor)
library(dplyr)
# install.packages("GGally")
# library(sets)
library(tidyverse)
library(ggplot2)
library(GGally) # for ggpairs
# install.packages("maps")
# library(maps)
# Read a CSV file into a tibble with readr.
#
# file_path: path of the CSV file to read.
# guess_max: number of leading rows readr inspects when guessing column
#            types. readr's own default of 1000 is too small for columns
#            that are empty at the top of the file: such a column is typed
#            as logical and every later text value becomes a parsing
#            failure. The default here is raised to 10000.
# ...      : further arguments forwarded to read_csv(), e.g. an explicit
#            `col_types` for columns that stay empty for even longer.
#
# Returns the tibble produced by read_csv().
load_file <- function(file_path, guess_max = 10000, ...) {
  read_csv(file_path, guess_max = guess_max, ...)
}

# Load the Texas COVID-19 case file (path is relative to the working directory).
tx_data <- load_file("./../data/COVID-19_cases_TX.csv")

── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
cols(
  county_fips_code = col_character(),
  county_name = col_character(),
  state = col_character(),
  state_fips_code = col_double(),
  date = col_date(format = ""),
  confirmed_cases = col_double(),
  deaths = col_double()
)
# Load the global mobility report.
# NOTE(review): the output printed for this call reports parsing failures
# because metro_area was typed as logical but holds text such as
# "Kabul Metropolitan Area" - consider supplying explicit column types.
global_mobility_report <- load_file("./../data/Global_Mobility_Report.csv")

── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
cols(
  country_region_code = col_character(),
  country_region = col_character(),
  sub_region_1 = col_character(),
  sub_region_2 = col_logical(),
  metro_area = col_logical(),
  iso_3166_2_code = col_character(),
  census_fips_code = col_logical(),
  date = col_date(format = ""),
  retail_and_recreation_percent_change_from_baseline = col_double(),
  grocery_and_pharmacy_percent_change_from_baseline = col_double(),
  parks_percent_change_from_baseline = col_double(),
  transit_stations_percent_change_from_baseline = col_double(),
  workplaces_percent_change_from_baseline = col_double(),
  residential_percent_change_from_baseline = col_double()
)

4199216 parsing failures.
 row        col           expected                  actual                                   file
3036 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
3037 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
3038 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
3039 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
3040 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
.... .......... .................. ....................... ......................................
See problems(...) for more details.
# Load the county-level COVID-19 cases joined with census attributes.
cases_plus_census <- load_file("./../data/COVID-19_cases_plus_census.csv")

── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_double(),
  county_fips_code = col_character(),
  county_name = col_character(),
  state = col_character(),
  state_fips_code = col_character(),
  date = col_date(format = ""),
  geo_id = col_character(),
  pop_5_years_over = col_logical(),
  speak_only_english_at_home = col_logical(),
  speak_spanish_at_home = col_logical(),
  speak_spanish_at_home_low_english = col_logical(),
  pop_15_and_over = col_logical(),
  pop_never_married = col_logical(),
  pop_now_married = col_logical(),
  pop_separated = col_logical(),
  pop_widowed = col_logical(),
  pop_divorced = col_logical()
)
ℹ Use `spec()` for the full column specifications.
# Keep only the census columns used in the analysis below.
cols_keep <- c("county_fips_code", "confirmed_cases", "deaths", "median_income", "male_pop", "female_pop", "total_pop", "median_age", "worked_at_home")
subset_census <- cases_plus_census[cols_keep]

# Keep the date plus the six percent-change-from-baseline mobility columns.
cols_keep <- c("date", "retail_and_recreation_percent_change_from_baseline", "grocery_and_pharmacy_percent_change_from_baseline", "parks_percent_change_from_baseline", "transit_stations_percent_change_from_baseline", "workplaces_percent_change_from_baseline", "residential_percent_change_from_baseline")
subset_mobility <- global_mobility_report[cols_keep]
# Harmless if the column was already parsed as a Date on load.
subset_mobility$date <- as.Date(subset_mobility$date, format = "%Y-%m-%d")

# Missingness overview of each data set. TRUE/FALSE are spelled out because
# T and F are ordinary variables that can be reassigned.
# global_mobility_report
vis_miss(global_mobility_report, sort_miss = TRUE, warn_large_data = FALSE)

vis_miss(tx_data, sort_miss = TRUE, warn_large_data = FALSE)

vis_miss(subset_census, sort_miss = TRUE, warn_large_data = FALSE)

library(RColorBrewer)
# Draw a US county choropleth of one column of a county-level data frame.
#
# df            : data frame containing the columns named by `fips_title`
#                 and `col_val`.
# col_val       : name (string) of the column mapped to the fill colour.
# percentile    : if TRUE, fill by binned percentile rank of `col_val`;
#                 otherwise fill by the raw values on a continuous scale.
# fips_title    : name of the column holding county FIPS codes. It is joined
#                 against maps::county.fips$fips, so it must be of a
#                 compatible type (the callers convert it to integer).
# banks         : number of break points between 0 and 100 for the
#                 percentile bins (gives banks - 1 bins). Only used when
#                 `percentile` is TRUE.
# legend_title  : legend title; defaults to `col_val` when empty.
# graphic_title : plot title.
#
# Returns a ggplot object.
plot_vs_county <- function(df, col_val, percentile = FALSE,
                           fips_title = "county_fips_code", banks = 6,
                           legend_title = "", graphic_title = "") {
  # Subset for speed
  df <- df[c(fips_title, col_val)]

  # County and state outlines
  gcounty <- ggplot2::map_data("county")
  gusa <- ggplot2::map_data("state")

  # Lookup from (region, subregion) to FIPS code; the ":..." suffix that
  # marks sub-polygons in polyname is stripped first.
  fipstab <-
    transmute(maps::county.fips, fips, county = sub(":.*", "", polyname)) %>%
    unique() %>%
    separate(county, c("region", "subregion"), sep = ",")

  # Attach FIPS codes to the polygons (NA where no code is known)
  gcounty <- left_join(gcounty, fipstab, c("region", "subregion"))

  if (percentile) {
    stopifnot(is.numeric(banks), length(banks) == 1, banks >= 2)
    # Rank the plain vector (not a one-column data frame) and bin it.
    df$pcls <- cut(100 * percent_rank(df[[col_val]]),
                   seq(0, 100, length.out = banks),
                   include.lowest = TRUE)
    # Always build the palette; interpolating from the 9-colour "Reds"
    # ramp works for any number of banks, not only banks > 9.
    mycolors <- colorRampPalette(RColorBrewer::brewer.pal(9, "Reds"))(banks)
  }

  # Counties without a matching row in df get NA and are drawn in the NA colour
  gcounty_pop <- left_join(gcounty, df, by = c("fips" = fips_title))

  if (legend_title == "") {
    legend_title <- col_val
  }

  if (percentile) {
    plt <- ggplot(gcounty_pop) +
      geom_polygon(aes(long, lat, group = group, fill = pcls),
                   color = "grey", size = 0.1) +
      geom_polygon(aes(long, lat, group = group),
                   fill = NA, data = gusa, color = "lightgrey") +
      coord_map("bonne", parameters = 41.6) + ggthemes::theme_map() +
      scale_fill_manual(values = mycolors, na.value = "grey") +
      theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
            legend.background = element_rect(fill = NA),
            legend.position = "left")
  } else {
    # .data[[col_val]] looks the column up in the plot data only, so a
    # global object that happens to share the name cannot be used by mistake.
    plt <- ggplot(gcounty_pop) +
      geom_polygon(aes(long, lat, group = group, fill = .data[[col_val]]),
                   color = "grey", size = 0.1) +
      geom_polygon(aes(long, lat, group = group),
                   fill = NA, data = gusa, color = "lightgrey") +
      coord_map("bonne", parameters = 41.6) + ggthemes::theme_map() +
      scale_fill_gradient2()
  }

  plt + labs(fill = legend_title) + ggtitle(graphic_title)
}
# Print the subset before the derived columns are added
subset_census
# Add per-capita rates (fractions of total_pop, not multiplied by 100) and
# convert the FIPS code to integer so it can be joined to the map data.
subset_census <- mutate(
  subset_census,
  pct_infected = confirmed_cases / total_pop,
  pct_deaths = deaths / total_pop,
  county_fips_code = as.integer(county_fips_code)
)
subset_census
plot_vs_county(
  subset_census,
  "pct_infected",
  legend_title = "Percent Infected"
)
Ignoring unknown parameters: name

# Percentile maps of infection and death rates by county
plot_vs_county(
  subset_census, "pct_infected",
  percentile = TRUE, banks = 11,
  legend_title = "Percentile Infected",
  graphic_title = "Percentile of Percentage of People Infected by County"
)

plot_vs_county(
  subset_census, "pct_deaths",
  percentile = TRUE, banks = 11,
  legend_title = "Percentile Deaths",
  graphic_title = "Percentile of Percentage of Deaths by County"
)

# Correlation matrix of the main census and outcome variables
census_corr_cols <- c(
  "deaths", "confirmed_cases", "median_income", "male_pop",
  "female_pop", "total_pop", "median_age", "worked_at_home"
)
ggcorr(
  subset_census[census_corr_cols],
  low = "red", mid = "grey", high = "blue", hjust = .75, size = 3,
  label = TRUE, label_size = 3, label_color = "white"
) +
  ggplot2::labs(title = "Pearson Correlation of Important Variables")

# Country code, date and every percent-change column of the mobility report
country_date_pct_change <- select(
  global_mobility_report,
  country_region_code, contains("date"), contains("percent")
)
country_date_pct_change
# country_date_pct_change$retail_and_recreation_percent_change_from_baseline
# Retail and recreation trend for the rows with country code "AE"
test <- country_date_pct_change %>%
  filter(country_region_code == "AE") %>%
  select(date, retail_and_recreation_percent_change_from_baseline)
ggplot(
  data = test,
  aes(x = date, y = retail_and_recreation_percent_change_from_baseline)
) +
  # geom_line(aes(y=rollmean(retail_and_recreation_percent_change_from_baseline, k=2, na.pad=TRUE)))
  geom_line()

# Number of distinct country codes in the mobility data
n_distinct(country_date_pct_change$country_region_code)
[1] 135
# uniqueN(country_date_pct_change, by = c("country_region_code"))
# The index vector must exist before it is used: without this definition
# zoo() fails with "object 'x.Date' not found". The dates are five example
# days in February 2003; the first four index the rows of the matrix.
x.Date <- as.Date("2003-02-01") + c(1, 3, 7, 9, 14) - 1
xm <- zoo(matrix(1:12, 4, 3), x.Date[1:4])
---
title: "Cleaned Notebook"
output: html_notebook
---

```{r}
library(beeswarm)
library(naniar)
library(zoo)
# install.packages("zoo")
library(janitor)
library(dplyr)
# install.packages("GGally")
# library(sets)
library(tidyverse)
library(ggplot2)
library(GGally) # for ggpairs
# install.packages("maps")
# library(maps)
```

```{r}
# Read a CSV file into a tibble with readr.
#
# file_path: path of the CSV file to read.
# guess_max: number of leading rows readr inspects when guessing column
#            types. readr's own default of 1000 is too small for columns
#            that are empty at the top of the file: such a column is typed
#            as logical and every later text value becomes a parsing
#            failure. The default here is raised to 10000.
# ...      : further arguments forwarded to read_csv(), e.g. an explicit
#            `col_types` for columns that stay empty for even longer.
#
# Returns the tibble produced by read_csv().
load_file <- function(file_path, guess_max = 10000, ...) {
  read_csv(file_path, guess_max = guess_max, ...)
}

# Load the three CSV inputs; paths are relative to the working directory.
# Texas COVID-19 cases
tx_data <- load_file("./../data/COVID-19_cases_TX.csv")
# Global mobility report
global_mobility_report <- load_file("./../data/Global_Mobility_Report.csv")
# County-level COVID-19 cases joined with census attributes
cases_plus_census <- load_file("./../data/COVID-19_cases_plus_census.csv")
```
```{r}
# Census columns used in the rest of the analysis
cols_keep <- c(
  "county_fips_code",
  "confirmed_cases",
  "deaths",
  "median_income",
  "male_pop",
  "female_pop",
  "total_pop",
  "median_age",
  "worked_at_home"
)
subset_census <- cases_plus_census[, cols_keep]

# cols_keep <- c("date", "retail_and_recreation_percent_change_from_baseline", "grocery_and_pharmacy_percent_change_from_baseline", "parks_percent_change_from_baseline", "transit_stations_percent_change_from_baseline", "workplaces_percent_change_from_baseline", "residential_percent_change_from_baseline")
# subset_mobility <- global_mobility_report[cols_keep]
# glo
# subset_mobility$date <- as.Date(subset_mobility$date, format="%Y-%m-%d")
```

```{r}
# Missingness overview of the mobility report. TRUE/FALSE are spelled out
# because T and F are ordinary variables that can be reassigned.
vis_miss(global_mobility_report, sort_miss = TRUE, warn_large_data = FALSE)
```


```{r}
# Missingness overview of the Texas cases and the census subset. TRUE/FALSE
# are spelled out because T and F are ordinary variables that can be reassigned.
vis_miss(tx_data, sort_miss = TRUE, warn_large_data = FALSE)
vis_miss(subset_census, sort_miss = TRUE, warn_large_data = FALSE)
```

```{r}
library(RColorBrewer)
# Draw a US county choropleth of one column of a county-level data frame.
#
# df            : data frame containing the columns named by `fips_title`
#                 and `col_val`.
# col_val       : name (string) of the column mapped to the fill colour.
# percentile    : if TRUE, fill by binned percentile rank of `col_val`;
#                 otherwise fill by the raw values on a continuous scale.
# fips_title    : name of the column holding county FIPS codes. It is joined
#                 against maps::county.fips$fips, so it must be of a
#                 compatible type (the callers convert it to integer).
# banks         : number of break points between 0 and 100 for the
#                 percentile bins (gives banks - 1 bins). Only used when
#                 `percentile` is TRUE.
# legend_title  : legend title; defaults to `col_val` when empty.
# graphic_title : plot title.
#
# Returns a ggplot object.
plot_vs_county <- function(df, col_val, percentile = FALSE,
                           fips_title = "county_fips_code", banks = 6,
                           legend_title = "", graphic_title = "") {
  # Subset for speed
  df <- df[c(fips_title, col_val)]

  # County and state outlines
  gcounty <- ggplot2::map_data("county")
  gusa <- ggplot2::map_data("state")

  # Lookup from (region, subregion) to FIPS code; the ":..." suffix that
  # marks sub-polygons in polyname is stripped first.
  fipstab <-
    transmute(maps::county.fips, fips, county = sub(":.*", "", polyname)) %>%
    unique() %>%
    separate(county, c("region", "subregion"), sep = ",")

  # Attach FIPS codes to the polygons (NA where no code is known)
  gcounty <- left_join(gcounty, fipstab, c("region", "subregion"))

  if (percentile) {
    stopifnot(is.numeric(banks), length(banks) == 1, banks >= 2)
    # Rank the plain vector (not a one-column data frame) and bin it.
    df$pcls <- cut(100 * percent_rank(df[[col_val]]),
                   seq(0, 100, length.out = banks),
                   include.lowest = TRUE)
    # Always build the palette; interpolating from the 9-colour "Reds"
    # ramp works for any number of banks, not only banks > 9.
    mycolors <- colorRampPalette(RColorBrewer::brewer.pal(9, "Reds"))(banks)
  }

  # Counties without a matching row in df get NA and are drawn in the NA colour
  gcounty_pop <- left_join(gcounty, df, by = c("fips" = fips_title))

  if (legend_title == "") {
    legend_title <- col_val
  }

  if (percentile) {
    plt <- ggplot(gcounty_pop) +
      geom_polygon(aes(long, lat, group = group, fill = pcls),
                   color = "grey", size = 0.1) +
      geom_polygon(aes(long, lat, group = group),
                   fill = NA, data = gusa, color = "lightgrey") +
      coord_map("bonne", parameters = 41.6) + ggthemes::theme_map() +
      scale_fill_manual(values = mycolors, na.value = "grey") +
      theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
            legend.background = element_rect(fill = NA),
            legend.position = "left")
  } else {
    # .data[[col_val]] looks the column up in the plot data only, so a
    # global object that happens to share the name cannot be used by mistake.
    plt <- ggplot(gcounty_pop) +
      geom_polygon(aes(long, lat, group = group, fill = .data[[col_val]]),
                   color = "grey", size = 0.1) +
      geom_polygon(aes(long, lat, group = group),
                   fill = NA, data = gusa, color = "lightgrey") +
      coord_map("bonne", parameters = 41.6) + ggthemes::theme_map() +
      scale_fill_gradient2()
  }

  plt + labs(fill = legend_title) + ggtitle(graphic_title)
}
```

```{r}
# Print the census subset before the derived columns are added
subset_census
```

```{r}
# Add per-capita rates (fractions of total_pop, not multiplied by 100) and
# convert the FIPS code to integer so it can be joined to the map data.
subset_census <- mutate(
  subset_census,
  pct_infected = confirmed_cases / total_pop,
  pct_deaths = deaths / total_pop,
  county_fips_code = as.integer(county_fips_code)
)
subset_census
```
```{r}
# Raw infection rate on a continuous scale
plot_vs_county(
  subset_census,
  "pct_infected",
  legend_title = "Percent Infected"
)
# Infection rate as binned percentile ranks
plot_vs_county(
  subset_census, "pct_infected",
  percentile = TRUE, banks = 11,
  legend_title = "Percentile Infected",
  graphic_title = "Percentile of Percentage of People Infected by County"
)
# Death rate as binned percentile ranks
plot_vs_county(
  subset_census, "pct_deaths",
  percentile = TRUE, banks = 11,
  legend_title = "Percentile Deaths",
  graphic_title = "Percentile of Percentage of Deaths by County"
)
```

```{r}
# Correlation matrix of the main census and outcome variables
census_corr_cols <- c(
  "deaths", "confirmed_cases", "median_income", "male_pop",
  "female_pop", "total_pop", "median_age", "worked_at_home"
)
ggcorr(
  subset_census[census_corr_cols],
  low = "red", mid = "grey", high = "blue", hjust = .75, size = 3,
  label = TRUE, label_size = 3, label_color = "white"
) +
  ggplot2::labs(title = "Pearson Correlation of Important Variables")
```


```{r}
# Country code, date and every percent-change column of the mobility report
country_date_pct_change <- select(
  global_mobility_report,
  country_region_code, contains("date"), contains("percent")
)
country_date_pct_change
```
```{r}
# country_date_pct_change$retail_and_recreation_percent_change_from_baseline
# Retail and recreation trend for the rows with country code "AE"
test <- country_date_pct_change %>%
  filter(country_region_code == "AE") %>%
  select(date, retail_and_recreation_percent_change_from_baseline)
ggplot(
  data = test,
  aes(x = date, y = retail_and_recreation_percent_change_from_baseline)
) +
  # geom_line(aes(y=rollmean(retail_and_recreation_percent_change_from_baseline, k=2, na.pad=TRUE)))
  geom_line()
```

```{r}

# Number of distinct country codes in the mobility data
n_distinct(country_date_pct_change$country_region_code)
# uniqueN(country_date_pct_change, by = c("country_region_code"))
```

```{r}
# TODO: unfinished assignment; commented out because an assignment with no
# right-hand side is a parse error and stops the chunk from running.
# x <-
```





















